Preliminary Project Setup - Create Tables for Data Checking

From Q
Jump to: navigation, search

This QScript scans through a data file looking for data that likely needs to be examined or corrected, and creates tables showing these tables and highlighting cells that may be of interest.

Technical details

It examines all tables containing non-Text and non-Date data and:

  • Identifies all tables containing cells with sample sizes of less than 30.
  • Identifies all tables containing Don't Know responses.
  • Identifies all tables containing blank labels.
  • Identifies all tables containing options chosen by 5 or fewer people, or, less than 1% respondents
  • Identifies all empty tables
  • Identifies numeric data containing outliers, where a variable contains values that are more than (less than) 3 standard deviations above (below) the mean
  • Identifies questions where the Base n is not the same for all variables

Identified tables have the relevant cells highlighted in yellow (via Rules or Table JavaScript in earlier versions of Q). Data containing outliers will be shown as histograms.

How to apply this QScript

  • Start typing the name of the QScript into the Search features and data box in the top right of the Q window.
  • Click on the QScript when it appears in the QScripts and Rules section of the search results.

OR

  • Select Automate > Browse Online Library.
  • Select this QScript from the list.

Customizing the QScript

This QScript is written in JavaScript and can be customized by copying and modifying the JavaScript.

Customizing QScripts in Q4.11 and more recent versions

  • Start typing the name of the QScript into the Search features and data box in the top right of the Q window.
  • Hover your mouse over the QScript when it appears in the QScripts and Rules section of the search results.
  • Press Edit a Copy (bottom-left corner of the preview).
  • Modify the JavaScript (see QScripts for more detail on this).
  • Either:
    • Run the QScript, by pressing the blue triangle button.
    • Save the QScript and run it at a later time, using Automate > Run QScript (Macro) from File.

Customizing QScripts in older versions

  • Copy the JavaScript shown on this page.
  • Create a new text file, giving it a file extension of .QScript. See here for more information about how to do this.
  • Modify the JavaScript (see QScripts for more detail on this).
  • Run the file using Automate > Run QScript (Macro) from File.

JavaScript

// Read in commonly-used functions from the Wiki
includeWeb('QScript Utility Functions');
includeWeb('QScript Questionnaire Functions');
includeWeb('QScript Selection Functions');
includeWeb('QScript Value Attributes Functions');
includeWeb('QScript Functions to Generate Outputs');
includeWeb('QScript Data Reduction Functions');
includeWeb('QScript Functions for Combining Categories');
includeWeb('QScript Functions for Processing Arrays');
includeWeb('QScript Table Functions');
includeWeb('JavaScript Utilities');
includeWeb('QScript Functions to Generate Outputs');
 
main();
 
function main() {     
    // Ask the user to choose which data file to use (if there are multiple data files)
    var selected_datafile = requestOneDataFileFromProject(false, true);
    var questions = getAllQuestionsByTypes([selected_datafile], ["Pick One","Pick One - Multi",
        "Number","Number - Multi","Number - Grid","Pick Any","Pick Any - Compact","Pick Any - Grid"]);
    var num_questions = questions.length;
    var check_questions = [];
    var outlier_questions = [];
    var inconsistent_base_questions = [];
    var empty_questions = [];
    var display_n = [];
    var display_base_n = [];
 
    var temporary_table = project.report.appendTable();
    var questions_containing_dk = [];
    questions.forEach(function (current_question){
        var q_type = current_question.questionType;
 
        if (q_type.indexOf("Number") == 0)
            if (questionHasOutliers(current_question, 3))
                outlier_questions.push(current_question);
 
        temporary_table.primary = current_question;
        temporary_table.secondary = "SUMMARY";
        var statistics = getStatisticsFromTable(temporary_table, ["Base n", "n","%","Row %"]);
        if (statistics == null) {
            empty_questions.push(current_question);
        } else {
            if (current_question.variables.length > 1 && !statisticIsConstant(statistics["Base n"])) {
                        inconsistent_base_questions.push(current_question);
            } else if (containsDontKnow(current_question) || containsBlankLabels(current_question))
                check_questions.push(current_question);
            else {
                if (statistics == null) {
                    check_questions.push(current_question);
                } else {
                    var not_in_yet = true;
 
 
                    if (not_in_yet && statistics["Base n"] != null &&  minWithNaNasNegativeInfinity(statistics["Base n"]) < 30){
                        not_in_yet = false;
                        display_base_n.push(current_question.name);
                    }
                    if (not_in_yet && statistics["n"] != null &&  minWithNaNasNegativeInfinity(statistics["n"]) < 5){
                        not_in_yet = false;
                        display_n.push(current_question.name);
                    }
                    if (not_in_yet && statistics["%"] != null &&  minWithNaNasNegativeInfinity(statistics["%"]) < 1.0)
                        not_in_yet = false;
 
                    if(!not_in_yet)
                        check_questions.push(current_question);
                }
            }
        }
    });
    temporary_table.deleteItem();
 
    // Specify the table javascript expression to highlight cells
    var expression = '// checks to see if a label contains one of the possible substrings (array)\r\n\
function containsSubstring(label, possible_substrings) {\r\n\
    for (var i in possible_substrings)\r\n\
        if((label).toLowerCase().indexOf(possible_substrings[i]) != -1) // checks if label contains any of the above\r\n\
            return true;\r\n\
    return false;\r\n\
}\r\n\
// checks to see if a label represents a don\'t know\r\n\
function isDontKnow(label) {\r\n\
    if (containsSubstring(label,["dk","d.k.","don\'t know","dont know","unsure","un-sure","not sure","do not know","no idea","N/A","applicable","not use", "never use"]))\r\n\
        return true;\r\n\
    if (label == "NA" || label == "na") //removes labels that are NA\r\n\
            return true;\r\n\
    return false;\r\n\
}\r\n\
// Names of statistics to be checked in each table\r\n\
var stats_to_check = ["Base n", "n","%","Row %"];\r\n\
// Corresponding maximum value for each statistic\r\n\
var upper_limits_for_stats = [30, 5, 1.0, 1.0];\r\n\
var colors = table.cellColors;\r\n\
var num_stats = stats_to_check.length;\r\n\
var num_rows = table.numberRows;\r\n\
var num_cols = table.numberColumns;\r\n\
for (var j = 0; j < num_stats; j++) {\r\n\
    // Check that the stat is available for this table\r\n\
    if (table.availableStatistics.indexOf(stats_to_check[j]) > -1) {\r\n\
        var values = table.get(stats_to_check[j]);\r\n\
        for (var row = 0; row < num_rows; row++)\r\n\
            for (var column = 0; column < num_cols; column++) { \r\n\
                if (values[row][column] < upper_limits_for_stats[j])\r\n\
                    colors[row][column] = "Yellow";\r\n\
            }\r\n\
    }\r\n\
}\r\n\
// Check each cell to see if its row or column label looks like a Dont Know style response\r\n\
var row_labels = table.rowLabels;\r\n\
var col_labels = table.columnLabels;\r\n\
if (num_cols > 1) {\r\n\
    for (var row = 0; row < num_rows; row++)\r\n\
        for (var column = 0; column < num_cols; column++) { \r\n\
            if (isDontKnow(row_labels[row]) || isDontKnow(col_labels[column]))\r\n\
                colors[row][column] = "Yellow";\r\n\
        }\r\n\
} else \r\n\
    for (var row = 0; row < num_rows; row++)\r\n\
        if (isDontKnow(row_labels[row]) || row_labels[row].search(/\\S/) == -1)\r\n\
            colors[row][0] = "Yellow";\r\n\
// Store the modified cell colors.\r\n\
table.cellColors = colors;\r\n\
if (fileFormatVersion() > 8.12)\r\n\
    form.setSummary("Highlight cells for checking")';
 
 
    if (check_questions.length == 0 && outlier_questions.length == 0 && inconsistent_base_questions.length == 0)
        log('No questions have been identified as being likely to require cleaning');
    else {
        var new_group = project.report.appendGroup();
        // Add histograms for outliers
        if (outlier_questions.length > 0) {
            var outlier_group = new_group.appendGroup();
            outlier_group.name = "Data that may contain outliers";
            outlier_questions.forEach(function (q) {
                var new_histogram = outlier_group.appendPlot("Histogram");
                new_histogram.primary = q;
            });
        }
 
         if (empty_questions.length > 0)
            generateSubgroupOfSummaryTables("Empty questions", new_group, empty_questions)
 
 
        // Make a table for each question to be checked. Add the table JavaScript (or Rule)
        var num_questions = check_questions.length;
        new_group.name = "Data to Review";
        var new_table;
        if (fileFormatVersion() > 8.12)
            var highlight_rule = project.rules.newCustomRule(expression);
 
        for (var j = 0; j < num_questions; j++) {
            new_table = new_group.appendTable();
            var current_question = check_questions[j];
            new_table.primary = current_question;
            var question_name = current_question.name;
            if (display_n.indexOf(question_name) != -1)
                addStatisticsToTableIfPossible(new_table, "n");
            if (display_base_n.indexOf(question_name) != -1) {
                addStatisticsToTableIfPossible(new_table, "Base n");
            }
            if (fileFormatVersion() > 8.12)
                new_table.rules.add(highlight_rule);
        }
        if (fileFormatVersion() <= 8.12)
            addTableJavaScriptToTablesInGroup(new_group, expression);
 
 
 
        if (inconsistent_base_questions.length > 0) {
            var base_group = new_group.appendGroup();
            base_group.name = "Data with varying Base n";
            inconsistent_base_questions.forEach(function (q) {
                var new_table = base_group.appendTable();
                new_table.primary = q;
                addStatisticsToTableIfPossible(new_table, "Base n");
                if (fileFormatVersion() > 8.12)
                    new_table.rules.add(highlight_rule);
            });
            if (fileFormatVersion() <= 8.12)
                addTableJavaScriptToTablesInGroup(base_group, expression);
            new_group.moveAfter(base_group, null);
        }
 
 
 
        var message = ["Tables have been created, and any cells containing values that should be reviewed have been marked in yellow.\r\n"];
        if (fileFormatVersion() > 8.12)
            message.push("To remove this formatting select one of the tables, select Automate > Manage Rules, and delete the Rule called \'Highlight cells for checking\'.");
        else
            message.push("To remove this formatting you need to select the tables, select Edit > Table JavaScript and delete the JavaScript.");
        if (inconsistent_base_questions.length > 0) {
            message.push("");
            message.push("Tables where the Base n varies between cells have been added to the folder 'Data with varying Base n'.")
        }
        if (outlier_questions.length > 0) {
            message.push("");
            message.push("Histograms showing questions with outliers have been added to the folder called 'Data that may contain outliers'.");
        }
 
 
        var report = simpleHTMLReport(message, new_group.name, new_group, true)
        conditionallyEmptyLog(message.join("\r\n"));
 
    }
}
 
 
 
 
// Returns true if the input question contains outliers according to the
// specified number of standard deviations from the mean.
function questionHasOutliers(question, number_sd_from_mean) {
    checkQuestionType(question, ["Number", "Number - Multi", "Number - Grid"]);        
    // constructing the outputs
    var temp_table = project.report.appendTable(); 
    temp_table.primary = question;
    //numeric_questions.push(question);
    temp_table.secondary = "SUMMARY";
    temp_table.cellStatistics = ['Average', 'Standard Deviation','Minimum', 'Maximum', 'Base n', 'Missing n'];
 
    var output; 
    try {
        output = temp_table.calculateOutput();       
    } catch (e) {
        temp_table.deleteItem();
        return false; //Question is empty;
    }
 
    var n_columns = question.questionType == "Number - Grid" ? output.numberColumns - 1: 1; //excludes sums
    var n_rows = Math.max(1, output.numberRows - 1);
    // determining if there are outliers
    var outliers = false;
    var min_permissable;
    var max_permissable;
    var v_counter;
    var means = output.get('Average');
    var sds = output.get('Standard Deviation');
    var mins = output.get('Minimum');
    var maxes = output.get('Maximum');
    for (var column = 0; column < n_columns; column++)
        for (var row = 0; row < n_rows; row++) {
            var mean = means[row][column];
            var sd = sds[row][column];
            min_permissable = mean - number_sd_from_mean * sd;
            max_permissable = mean + number_sd_from_mean * sd;
            var too_low = mins[row][column] < min_permissable;
            var too_high = maxes[row][column] > max_permissable;
            if (too_low || too_high)
                outliers = true;
        }
    temp_table.deleteItem();
    return outliers;
}
 
// Checks to see if a question contains a Don't Know option
function containsDontKnow(current_question) {
    current_value_attributes = current_question.valueAttributes;
    current_unique_values = current_question.uniqueValues;
    num_vals = current_unique_values.length;
    labels = valueLabels(current_question);
    for (var k = 0; k < num_vals; k++) {
        if (isDontKnow(labels[k]))
            return true;
    }
    return false;
}
 
// Checks to see if a question has blank labels in the data reduction
function containsBlankLabels(question) {
    var var_labels = question.variables.map(function (v) { return v.label; });
    var attributes = question.valueAttributes;
    var value_labels = question.uniqueValues.map(function (v) { return attributes.getLabel(v); });
    return value_labels.concat(var_labels).filter(function (s) { return s.search(/\S/) == -1; }).length > 0;
}
 
function statisticIsConstant(stat_array) {
    var flat_array = [].concat.apply([], stat_array);
    var min = Math.min.apply(null, flat_array);
    var max = Math.max.apply(null, flat_array);
    return min == max;
}


See also

Personal tools
Namespaces

Variants
Actions
Navigation
Categories
Toolbox