Create New Variables - Variable(s) with Outliers Removed

From Q
Jump to: navigation, search

Create new variable(s) with any outlying values in the selected variable(s) replaced with NaN

This tool checks the selected data for outliers and creates new copies of the data with the outliers removed. Outliers are defined as values that are not within a certain number of standard deviations from the variable mean, and you can choose how many standard deviations are used to determine which values are considered to be outliers. The default value is 3 standard deviations. The new copies of data will have the outlying values replaced with missing values. Data that does not contain outliers will not be copied.

A new folder will be created in the report tree that contains tables for the selected data and any new copies of data with the outliers removed.

The new copies of the variables use a JavaScript formula to assign a value of NaN to respondents with outlying values. The means and standard deviations are computed once, when this script is run. As a result, the definition of an outlier in the variables where outliers have been removed will not be updated if the underlying data changes.

How to apply this QScript

  • Start typing the name of the QScript into the Search features and data box in the top right of the Q window.
  • Click on the QScript when it appears in the QScripts and Rules section of the search results.

OR

  • Select Automate > Browse Online Library.
  • Select this QScript from the list.

Customizing the QScript

This QScript is written in JavaScript and can be customized by copying and modifying the JavaScript.

Customizing QScripts in Q4.11 and more recent versions

  • Start typing the name of the QScript into the Search features and data box in the top right of the Q window.
  • Hover your mouse over the QScript when it appears in the QScripts and Rules section of the search results.
  • Press Edit a Copy (bottom-left corner of the preview).
  • Modify the JavaScript (see QScripts for more detail on this).
  • Either:
    • Run the QScript, by pressing the blue triangle button.
    • Save the QScript and run it at a later time, using Automate > Run QScript (Macro) from File.

Customizing QScripts in older versions

  • Copy the JavaScript shown on this page.
  • Create a new text file, giving it a file extension of .QScript. See here for more information about how to do this.
  • Modify the JavaScript (see QScripts for more detail on this).
  • Run the file using Automate > Run QScript (Macro) from File.

JavaScript

includeWeb("QScript Selection Functions");
includeWeb("QScript Functions to Generate Outputs"); 
includeWeb("QScript Table Functions"); 

 
// Entry point: run the outlier check as soon as the script is loaded.
outliersRemoved();
 
/**
 * Checks the selected numeric questions for outliers and, for every question
 * in which at least one variable contains an outlier, creates a copy of the
 * question whose variables replace out-of-range values with NaN.
 *
 * The user is prompted for the cut-off (number of standard deviations from
 * the mean). When not running in Displayr, summary tables and an HTML report
 * are added to a new report group; in Displayr the outcome is only logged.
 *
 * @returns {boolean} false when no valid questions were selected, otherwise true.
 */
function outliersRemoved() {
    const web_mode = inDisplayr();
    const allowed_types = ["Numeric", "Numeric - Multi", "Numeric - Grid"];
    const summary_statistics = ['Average', 'Standard Deviation', 'Minimum', 'Maximum', 'Base n', 'Missing n'];

    const questions = selectInputQuestions(allowed_types);
    if (!questions)
        return false;
    if (!areQuestionsValidAndNonEmpty(questions))
        return false;

    // Specify the number of standard deviations that defines an 'outlier'.
    // A blank entry is rejected explicitly because isNaN("") is false and would
    // silently become a cut-off of 0; a negative cut-off is rejected because it
    // would put the lower bound above the upper bound.
    let number_sd_from_mean;
    let cut_off_is_valid = false;
    while (!cut_off_is_valid) {
        number_sd_from_mean = prompt("Enter the cut-off value to use to identify outliers. Cases whose value is not within this many standard deviations from the mean will be considered outliers.", 3);
        const is_blank = typeof number_sd_from_mean === "string" && number_sd_from_mean.trim() === "";
        cut_off_is_valid = !is_blank && !isNaN(number_sd_from_mean) && !(number_sd_from_mean < 0);
        if (!cut_off_is_valid)
            alert('The cut-off value must be a non-negative number.');
    }

    // Begin the report
    let paragraphs = ["The tables below show data that has been checked for outliers.",
                      "Test: value is more than " + number_sd_from_mean + " standard deviations from the mean.",
                      "Where the data contains outliers, a copy of the data is made and it is labeled as 'OUTLIERS REMOVED'",
                      " "];

    // The report group only exists when tables are being created (Q only).
    let group;
    if (!web_mode) {
        group = project.report.appendGroup();
        group.name = "Checked for outliers";
    }

    // Adds a SUMMARY table for the given question to the report group.
    const appendSummaryTable = function (table_question) {
        const table = group.appendTable();
        table.primary = table_question;
        table.secondary = "SUMMARY";
        table.cellStatistics = summary_statistics.slice();
    };

    // Check each of the input questions and construct new variables where needed
    let outliers_found = false;
    const outlier_list = [];
    const no_outliers_list = [];
    questions.forEach(function (question) {
        if (question.isHidden)
            return;

        const data_file = question.dataFile;
        // Make a table for the original question (Q only)
        if (!web_mode)
            appendSummaryTable(question);

        // Check the statistics for each variable in the question to determine
        // if there are any outliers
        const variables = question.variables;
        const outlier_data = variables.map(function (v) {
            return checkVariableForOutliers(v, number_sd_from_mean);
        });
        const outliers = outlier_data.some(function (obj) {
            return obj.hasOutliers;
        });
        if (!outliers) {
            no_outliers_list.push(question.name);
            return;
        }

        // Generate new variables for every variable in the question, because
        // the copy has to keep the same structure as the original question.
        outliers_found = true;
        outlier_list.push(question.name);
        const new_variables = [];
        for (let v = 0; v < variables.length; v++) {
            const v_name = variables[v].name;
            // Inclusive bounds: a value exactly on a bound is not an outlier,
            // which matches the strict comparisons used when detecting them.
            const expression = "if (" + v_name + " >= " + outlier_data[v].minPermissable + " && " + v_name + " <= " + outlier_data[v].maxPermissable + ") " + v_name + "; else NaN";
            try {
                new_variables.push(data_file.newJavaScriptVariable(expression, false, preventDuplicateVariableName(data_file, v_name + "_noOutliers"), variables[v].label, null));
            } catch (e) {
                log("Could not identify outliers in " + v_name + ": " + e);
                // Only this question is abandoned; the remaining questions are still processed.
                return;
            }
        }
        data_file.moveAfter(new_variables, variables[variables.length - 1]);
        const new_question = data_file.setQuestion(preventDuplicateQuestionName(data_file, question.name + " OUTLIERS REMOVED"), question.questionType, new_variables);

        insertAtHoverButtonIfShown(new_question);

        // Make a table for the cleaned copy (Q only)
        if (!web_mode)
            appendSummaryTable(new_question);
    });

    if (!web_mode) {
        if (outliers_found) {
            paragraphs.push("Outliers found in:");
            paragraphs.push("");
            paragraphs = paragraphs.concat(outlier_list);
        } else
            paragraphs.push('No outliers found');

        simpleHTMLReport(paragraphs, "Checked for outliers", group, true, false);

    } else { // In Displayr, just report what was done in a log.
        if (!outliers_found) {
            log("No outliers were detected in the selected data.");
        } else if (no_outliers_list.length > 0) {
            log("Some of the selected variable sets did not contain outliers:\r\n");
            log(no_outliers_list.join("\r\n"));
        }
    }
    return true;
}

 
/**
 * Checks a variable for outliers.
 *
 * Computes the unweighted mean and sample standard deviation of the
 * variable's raw values (null and NaN entries are ignored) and tests whether
 * the smallest or largest value lies outside
 * mean +/- number_sd_from_mean * sd.
 *
 * When fewer than two usable values exist the standard deviation is
 * undefined, so nothing can be classed as an outlier: the bounds returned are
 * -Infinity and Infinity rather than NaN, which would otherwise make a
 * formula built from them reject every value.
 *
 * @param {Object} variable - Object exposing a `rawValues` array.
 * @param {number} number_sd_from_mean - Cut-off, in standard deviations.
 * @returns {{hasOutliers: boolean, maxPermissable: number, minPermissable: number}}
 */
function checkVariableForOutliers(variable, number_sd_from_mean) {

    // First pass: count, total and range of the usable values (note, no weights used)
    const xx = variable.rawValues;
    let n = 0;
    let tot = 0;
    let min = Infinity;
    let max = -Infinity;
    for (let i = 0; i < xx.length; i++) {
        if (xx[i] != null && !isNaN(xx[i])) {
            tot += xx[i];
            n++;

            if (xx[i] > max)
                max = xx[i];
            if (xx[i] < min)
                min = xx[i];
        }
    }

    // The sample standard deviation divides by n - 1, so it needs at least two values.
    if (n < 2)
        return { hasOutliers: false, maxPermissable: Infinity, minPermissable: -Infinity };

    // Second pass: sum of squared deviations from the mean
    const mean = tot / n;
    let sum_sq_dev = 0;
    for (let i = 0; i < xx.length; i++) {
        if (xx[i] != null && !isNaN(xx[i]))
            sum_sq_dev += (xx[i] - mean) * (xx[i] - mean);
    }
    const sd = Math.sqrt(sum_sq_dev / (n - 1));

    const min_permissable = mean - number_sd_from_mean * sd;
    const max_permissable = mean + number_sd_from_mean * sd;
    // Only the extremes need checking: if they are in range, every value is.
    const outliers = min < min_permissable || max > max_permissable;

    return { hasOutliers: outliers, maxPermissable: max_permissable, minPermissable: min_permissable };
}

See also