Create New Variables - Midpoint Coding and Quantification

From Q
Jump to: navigation, search

This QScript automatically recodes the labels of categorical variables into new numeric variables, quantifying the information contained within the labels of input variable(s).

To apply the same recoding without generating new variables, use the QScript Recoding - Midpoint Coding and Quantification instead.

Example

MidpointAge.png

Technical details

This QScript looks for Pick One, Pick One - Multi, Number, and Number - Multi questions that have numbers or ranges of numbers in their labels and:

  • Attempts to set values for the question that correspond to the numbers in the labels for each category, or midpoints thereof.
  • Creates a new Number or Number - Multi question for each.

Where labels contain a single number, this value will be used. If no number is detected in the label, then the value of NaN will be assigned. Recoding will only be applied for questions that have three or more labels containing numbers.

Where the label contains a range of numbers, for example 18 to 24, the midpoint value will be used (21 in this case). If a question is recoded according to midpoints and it contains a lower label like Less than 18, then the midpoint will be halfway between zero and the number in the label (9 in this example). When the question is recoded according to midpoints and it contains an upper label like 55 or more, then the midpoint will be the number in the label plus half the width of the previous interval, so if the previous interval was 50 to 54, this midpoint will be set to 57. If no midpoint for a label can be determined, then a value of NaN will be assigned.

If more than half of a question's labels include any kind of brackets, e.g. [[ or (, then only the text inside the brackets will be used in place of the full labels. If there is no closing bracket (the label has been truncated), then everything after the opening bracket will be used.

Labels that contain references to time periods, such as days, weeks, minutes, and hours, or other units like litres or kilograms are difficult to recode in this way. Any such questions will be added to a separate folder in your report.

How to apply this QScript

  • Start typing the name of the QScript into the Search features and data box in the top right of the Q window.
  • Click on the QScript when it appears in the QScripts and Rules section of the search results.

OR

  • Select Automate > Browse Online Library.
  • Select this QScript from the list.

Customizing the QScript

This QScript is written in JavaScript and can be customized by copying and modifying the JavaScript.

Customizing QScripts in Q4.11 and more recent versions

  • Start typing the name of the QScript into the Search features and data box in the top right of the Q window.
  • Hover your mouse over the QScript when it appears in the QScripts and Rules section of the search results.
  • Press Edit a Copy (bottom-left corner of the preview).
  • Modify the JavaScript (see QScripts for more detail on this).
  • Either:
    • Run the QScript, by pressing the blue triangle button.
    • Save the QScript and run it at a later time, using Automate > Run QScript (Macro) from File.

Customizing QScripts in older versions

  • Copy the JavaScript shown on this page.
  • Create a new text file, giving it a file extension of .QScript. See here for more information about how to do this.
  • Modify the JavaScript (see QScripts for more detail on this).
  • Run the file using Automate > Run QScript (Macro) from File.

JavaScript

includeWeb('QScript R Output Functions');
includeWeb('QScript Selection Functions');
includeWeb('QScript Functions to Generate Outputs');
includeWeb("JavaScript Utilities");
includeWeb("JavaScript Array Functions");
includeWeb('QScript Questionnaire Functions');
includeWeb("QScript Table Functions");
includeWeb('QScript Utility Functions');
includeWeb('QScript Value Attributes Functions');
includeWeb('QScript Data Reduction Functions');
includeWeb('QScript Functions for Combining Categories');
includeWeb('QScript Functions for Processing Arrays');


// Run the recoding with creation of new variables enabled, then log
// whether it completed or was cancelled.
if (midpointAndQuantificationRecodingCustom(true)) {
    conditionallyEmptyLog("QScript finished.");
} else {
    log("QScript cancelled.");
}

// Recodes the questions chosen by the user so that their values are numbers
// inferred from the value labels, optionally creates numeric copies of the
// recoded questions, and reports what was done.
//
// add_new_vars - when truthy, each recoded question is duplicated as a
//                Number / Number - Multi question and the copy is reported
//                alongside the original.
//
// Returns false when selectInputQuestions() yields nothing or the selection
// fails areQuestionsValidAndNonEmpty(); returns true otherwise.
function midpointAndQuantificationRecodingCustom(add_new_vars) {

    // True when more than max_number of the labels are flagged by labelRefersToUnits.
    function labelsReferToUnits(label_array, max_number) {
        return label_array.filter(labelRefersToUnits).length > max_number;
    }

    // Checks a single label against a list of measurement-unit words.
    function labelRefersToUnits(label) {
        return containsSubstring(label, ["metres", "litres", "grams", "pound", "stone"]);
    }

    // True when more than max_number of the labels are flagged by labelRefersToRates.
    function labelsReferToRates(label_array, max_number) {
        return label_array.filter(labelRefersToRates).length > max_number;
    }

    // Checks a single label against a list of words that suggest a rate.
    function labelRefersToRates(label) {
        return containsSubstring(label, ["per", "each", "every"]);
    }

    // Recode the question as appropriate.
    // Return an object containing:
    // - question - the original question
    // - incorrect - if we think the recoding is likely to be incorrect because the labels are ambiguous
    // - midpoints - flag is true if we used midpoint coding
    // - quantified - flag is true if we did not use midpoint coding but instead quantified the labels
    function recodeQuestionByMidpointsAndQuantification(question) {
        var labels = valueLabels(question);
        var current_values = question.uniqueValues;

        // Midpoint coding is preferred, but only when at least three midpoints were found.
        var midpoints = computeMidpointsInQuestionCustom(question, [" to ", "-"]);
        var coded_by_midpoints = numberOfMidpoints(midpoints) >= 3;

        // Whichever coding is used, labels that mention time periods, rates or
        // units are flagged so the result can be checked by the user.
        var could_be_incorrect = !!(labelsReferToTime(labels, 1)
            || labelsReferToRates(labels, 1)
            || labelsReferToUnits(labels, 1));

        if (coded_by_midpoints) {
            current_values.forEach(function (value) {
                setValueForVariablesInQuestion(question, value, getInferredValueFromMidpoints(midpoints, value));
            });
        } else {
            // Code by quantification.
            // If more than half of the labels contain strings in brackets eg (...)
            // then quantify the string from inside the brackets rather than the
            // full labels.
            var labels_in_brackets = extractStringsFromBrackets(labels);
            var num_in_brackets = labels_in_brackets.filter(function (label) { return label.length > 0; }).length;
            var labels_to_quantify = num_in_brackets > labels.length / 2 ? labels_in_brackets : labels;
            var quantified_labels = labels_to_quantify.map(quantify);

            // The k-th label is taken to belong to the k-th unique value.
            for (var k = 0; k < current_values.length; k++)
                setValueForVariablesInQuestion(question, current_values[k], quantified_labels[k]);
        }

        return {
            question: question,
            incorrect: could_be_incorrect,
            midpoints: coded_by_midpoints,
            quantified: !coded_by_midpoints
        };
    }

    // Make a copy of the question in obj, convert it to numeric, and relabel
    // it depending on how it has been recoded in the last step.
    // Returns the fields of obj plus new_question (the copy).
    function copyAndConvertToNumeric(obj) {
        var question = obj.question;
        var extra_label = obj.midpoints ? " - RECODED WITH MID-POINTS" : " - RECODED WITH NUMBERS FROM LABELS";
        var new_q = question.duplicate(preventDuplicateQuestionName(question.dataFile, question.name + extra_label));

        // Questions whose type already contains "Number" keep their type;
        // "Pick One" becomes "Number" and every other type becomes "Number - Multi".
        if (question.questionType.indexOf("Number") == -1)
            new_q.questionType = question.questionType == "Pick One" ? "Number" : "Number - Multi";

        // A single-variable copy gets its variable label aligned with the new question name.
        var v = new_q.variables;
        if (v.length === 1)
            v[0].label = new_q.name;

        return {
            question: obj.question,
            new_question: new_q,
            incorrect: obj.incorrect,
            midpoints: obj.midpoints,
            quantified: obj.quantified
        };
    }

    // For Number and Number - Multi, do the values match the labels?
    // Returns false as soon as a non-NaN value differs from its label text.
    // NOTE(review): this helper is not called anywhere inside this function.
    function valuesAndLabelsMatch(q) {
        var value_attributes = q.valueAttributes;
        var unique_values = q.uniqueValues;
        for (var j = 0; j < unique_values.length; j++) {
            var val = value_attributes.getValue(unique_values[j]);
            if (!isNaN(val) && val.toString() != value_attributes.getLabel(unique_values[j])) {
                return false;
            }
        }
        return true;
    }

    var web_mode = (!!Q.isOnTheWeb && Q.isOnTheWeb());
    var allowed_types = ["Nominal", "Ordinal", "Numeric", "Nominal - Multi",
        "Ordinal - Multi", "Numeric - Multi"];
    var selected_questions = selectInputQuestions(allowed_types);
    if (!selected_questions)
        return false;

    if (!areQuestionsValidAndNonEmpty(selected_questions))
        return false;

    // First recode the original questions
    var recoding_objects = selected_questions.map(recodeQuestionByMidpointsAndQuantification);

    // Next create numeric copies of the questions if needed
    if (add_new_vars)
        recoding_objects = recoding_objects.map(copyAndConvertToNumeric);

    // Sort the recoded questions and new questions into arrays
    // based on how they've been recoded. A possibly-incorrect recoding takes
    // precedence over the midpoint / quantified distinction.
    var midpoint_questions = [];
    var quantified_questions = [];
    var incorrect_questions = [];
    recoding_objects.forEach(function (obj) {
        var target;
        if (obj.incorrect)
            target = incorrect_questions;
        else if (obj.midpoints)
            target = midpoint_questions;
        else
            target = quantified_questions;

        target.push(obj.question);
        if (add_new_vars)
            target.push(obj.new_question);
    });

    // Nothing was recoded, so there is nothing to report.
    if (quantified_questions.length + midpoint_questions.length + incorrect_questions.length === 0)
        return true;

    var log_messages = [];
    var recoded_group = project.report.appendGroup();
    recoded_group.name = "Recoded Variables";

    if (quantified_questions.length > 0) {
        var quant_group_name = "Questions recoded with numbers from labels";
        generateSubgroupOfSummaryTablesCustom(quant_group_name, recoded_group, quantified_questions);
        log_messages.push("New questions that have been recoded using numbers from the labels have been added to the folder: '" + quant_group_name + "'.");
    }

    if (midpoint_questions.length > 0) {
        var midpoint_group_name = "Questions recoded with midpoints";
        generateSubgroupOfSummaryTablesCustom(midpoint_group_name, recoded_group, midpoint_questions);
        log_messages.push("New questions that have been recoded using midpoints have been added to the folder: '" + midpoint_group_name + "'.");
    }

    if (incorrect_questions.length > 0) {
        var temporal_group = "Possibly incorrectly-recoded variables";
        generateSubgroupOfSummaryTablesCustom(temporal_group, recoded_group, incorrect_questions);
        log_messages.push("Recoded questions whose labels refer to time periods and other units have been added to the folder: '" + temporal_group + "'. These recodings should be checked carefully.");
    }

    // At least one question was recoded at this point, so the reminder always applies.
    log_messages.push("Please check the Values of the new questions to ensure that you are happy with the computed values. This is done by right-clicking on the table and selecting Values.");

    if (!web_mode) {
        conditionallyEmptyLog(log_messages.join("\r\n"));
        simpleHTMLReport(log_messages, "Recoded Variables", recoded_group, true, false);
    } else {
        var web_message = "Tables showing the recoded variables have been added to the bottom of the document.";
        if (incorrect_questions.length > 0)
            web_message = web_message + "\r\nSome variables have labels which include time periods and other units of measurement, and as a result the coding may not be correct. The coding for these variables should be checked carefully.";
        log(web_message);
    }

    return true;
}
// // Create page and title
//     const pageName = "Most Significant Results"
//     const page = project.report.appendPage('TitleOnly');
//     page.group.moveAfter(page, null);
//     page.name = pageName;
//     var titleText = page.subItems[0];
//     titleText.text = pageName;
    
//     var results_item = page.appendR(r_expression);
//     log("A table showing the most significant results has been added to the top of your Document.");
// }

// Computes the midpoints for the value labels of a question.
// The labels are split up by the specified array of delimiters.
//
// question   - the question whose uniqueValues and value labels are examined.
// delimiters - array of strings passed to labelAsArray to split a label
//              into the two ends of a range.
//
// Returns an array with one entry per label for which a numeric value could
// be inferred. Each entry has:
// - sourceValue     - the corresponding element of question.uniqueValues
// - inferredValue   - the value inferred for that label
// - genuineMidPoint - true when the label split into exactly two parts
function computeMidpointsInQuestionCustom(question, delimiters) {
    var lower_limit_words = ["under", "less", "lower", "up to", "below"];
    var upper_limit_words = ["over", "more", "greater", "+", "plus", "above"];

    var source_values = question.uniqueValues;
    var num_vals = source_values.length;
    var all_labels = valueLabels(question);

    // If more than half of the labels contain strings in brackets eg (...)
    // then use the contents of the brackets in place of the original labels.
    var labels_in_brackets = extractStringsFromBrackets(all_labels);
    var num_in_brackets = labels_in_brackets.filter(function (label) { return label.length > 0; }).length;
    if (num_in_brackets > all_labels.length / 2)
        all_labels = labels_in_brackets;

    // Lower-case once so the keyword checks below need no further conversion.
    all_labels = all_labels.map(function (x) { return x.toLowerCase(); });

    var midpoints = [];
    var last_range = 0; // width of the most recent two-part range that produced a value
    var midpoint;
    for (var value_i = 0; value_i < num_vals; value_i++) {
        var label = all_labels[value_i];
        var array_of_values = quantifyArray(labelAsArray(label, delimiters, 2));
        var label_number = quantify(label);
        var has_number = !isNaN(label_number);

        // Lower limit rule: a keyword such as "less", exactly one number in the
        // label, and the label is one of the first two or follows a label
        // without a number.
        var is_lower_limit = has_number
            && containsSubstring(label, lower_limit_words)
            && array_of_values.filter(function (x) { return !isNaN(x); }).length == 1
            && (value_i <= 1 || isNaN(quantify(all_labels[value_i - 1])));

        // Upper limit rule: a keyword such as "more", and the label is one of
        // the last two or precedes a label without a number.
        var is_upper_limit = !is_lower_limit
            && has_number
            && containsSubstring(label, upper_limit_words)
            && (value_i >= num_vals - 2 || isNaN(quantify(all_labels[value_i + 1])));

        if (is_lower_limit)
            midpoint = label_number / 2.0; // half-way between zero and the number
        else if (is_upper_limit)
            midpoint = label_number + last_range / 2.0; // number plus half the previous range width
        else
            midpoint = computeMidpoint(array_of_values);

        if (isNumber(midpoint)) {
            var is_range = array_of_values.length == 2;
            midpoints.push({
                sourceValue: source_values[value_i],
                inferredValue: midpoint,
                genuineMidPoint: is_range
            });
            if (is_range)
                last_range = Math.abs(array_of_values[1] - array_of_values[0]);
        }
    }
    return midpoints;
}


// Adds a sub-folder called group_name to within_group and fills it with one
// table per entry of question_array, each table using that entry as its
// primary question. When Q.isOnTheWeb exists and reports true, nothing is
// created and undefined is returned; otherwise the new sub-folder is returned.
function generateSubgroupOfSummaryTablesCustom(group_name, within_group, question_array) {
    var running_on_web = !!Q.isOnTheWeb && Q.isOnTheWeb();
    if (running_on_web)
        return undefined;

    var question_count = question_array.length;
    var subgroup = within_group.appendGroup();
    subgroup.name = group_name;

    for (var index = 0; index < question_count; index += 1)
        subgroup.appendTable().primary = question_array[index];

    return subgroup;
}

See also