Text Analysis - Advanced - Save Variable(s) - First Category

From Q
Jump to: navigation, search

This QScript saves the first categories from a selected Automatic Categorization - List of Items, Automatic Categorization - Unstructured Text, Automatic Categorization - Entity Extraction or Automatic Categorization - Advanced - Setup Text Analysis output as a Pick One (Nominal) or Pick One - Multi (Nominal - Multi) question.

Technical Details

The variables created by this QScript may become invalid, and need to be deleted and recreated, if the output from Automatic Categorization - List of Items, Automatic Categorization - Unstructured Text, Automatic Categorization - Entity Extraction or Automatic Categorization - Advanced - Setup Text Analysis has changed, either because the input text variable was modified or because the input settings were modified.

This QScript requires Q version 5.5.1 or later to be executed.

Code

// Load the shared "QScript R Output Functions" library.
// NOTE(review): helpers used by main() but not defined in this listing (e.g.
// getSelectedROutputFromPage, getDataFileFromItemDependants, stringToRName)
// presumably come from this library or from Q's built-ins - confirm.
includeWeb("QScript R Output Functions");

// Run the script.
main();

/**
 * Saves the first category of each case from the selected text analysis
 * R output as a new R question in the output's data file.
 *
 * Four kinds of output are handled, chosen by the output's R class:
 *   - "categorizedlist":         one variable per input text variable,
 *                                saved as Pick One / Pick One - Multi.
 *   - "AutomaticCategorization": a single Pick One variable.
 *   - "EntityExtraction":        one question per entity type.
 *   - "wordBag":                 a single variable holding the first
 *                                remaining token of each case.
 *
 * When not running on the web, a table showing each new question is appended
 * to the selected output's group and selected.
 *
 * @returns {boolean} true when the question(s) were created; false when the
 *     script stopped early (the reason is reported through log()).
 */
function main() {
    // selected_item.data can only be called from Q 5.5.1 onwards.
    if (Q.fileFormatVersion() < 14.11)
    {
        log("Q 5.5.1 or later is required to run this script, please update your version of Q.");
        return false;
    }

    var script_name = "Save Variable(s) - First Category";
    var analysis_name = "First Categories";

    var bad_selection_message = "Select a List Categorization, Automatic Categorization, Entity Extraction or Setup Text Analysis output.";
    var web_mode = (!!Q.isOnTheWeb && Q.isOnTheWeb());

    var selected_item = getSelectedROutputFromPage(["categorizedlist", "AutomaticCategorization", "EntityExtraction", "wordBag"]);
    if (selected_item === null) {
        log(bad_selection_message);
        return false;
    }

    var data_file = getDataFileFromItemDependants(selected_item);

    if (data_file == null)
    {
        log("'Save variables' cannot be applied to an output with no data file.");
        return false;
    }

    // Find last variable, which we will place the generated variables after
    var last_variable = getLastVariable(getVariables(selected_item.dependants(false)));

    // Builds a variable name from a question name: drops the first " from ",
    // replaces disallowed characters with "_", lower-cases the result and
    // passes it to randomVariableName() as the prefix.
    function makeVariableName(question_name) {
        var prefix = question_name.replace(" from ", " ").replace(/[^a-zA-Z0-9_@\#\$\\]/g, '_').toLowerCase() + "_";
        return randomVariableName(16, prefix);
    }

    // Creates the R question after last_variable. Returns null (after logging
    // "<failure_subject> could not be created ...") when Q rejects it.
    function createQuestion(expression, question_name, failure_subject) {
        var variable_name = makeVariableName(question_name);
        try {
            return data_file.newRQuestion(expression, question_name, variable_name, last_variable);
        } catch (e) {
            log(failure_subject + " could not be created from this item: " + e);
            return null;
        }
    }

    // Appends a table for the question next to the selected output and selects it.
    function showInTable(question) {
        var table = selected_item.group.appendTable();
        table.primary = question;
        project.report.setSelectedRaw([table]);
    }

    var output_classes = selected_item.outputClasses;
    var new_r_question;
    var new_q_name;
    var expression;

    if (output_classes.indexOf("categorizedlist") > -1)
    {
        var variable_names = selected_item.data.get("variable.names");

        // The generated R code stops with an error if the input variable names
        // no longer match those recorded here. Every apostrophe has to be
        // stripped on the JavaScript side (hence the /g regex) because the R
        // side strips all of them with gsub(); a plain string pattern would
        // only remove the first one.
        expression = "categorizedlist = " +
                     stringToRName(selected_item.referenceName) +
                     "\nif (paste0(gsub('\\'', '', categorizedlist$variable.names, fixed = TRUE), collapse = '') != '" +
                     variable_names.join("").replace(/'/g, "") + "')\n" +
                     "    stop('The input text variables used for the Text Analysis have changed " +
                     "and as a result these variables are no longer valid. Please delete these variables and rerun the " +
                     script_name + " script on the Text Analysis output.')\n" +
                     "\nresult <- matrix(NA, nrow = length(categorizedlist$subset), ncol = length(categorizedlist$variable.names))\n" +
                     "result[categorizedlist$subset, ] <- sapply(categorizedlist$transformed.text,\n" +
                     "function(x) if (is.null(x)) NA else x[1])\n" +
                     "result <- data.frame(result, stringsAsFactors = FALSE)\n" +
                     "result <- data.frame(lapply(result, function(x) factor(x, levels = categorizedlist$final.tokens)))\n" +
                     "names(result) <- categorizedlist$variable.names\n" +
                     "result";

        new_q_name = preventDuplicateQuestionName(data_file, analysis_name + " from " + selected_item.referenceName);
        new_r_question = createQuestion(expression, new_q_name, analysis_name);
        if (new_r_question === null)
            return false;

        // Label each new variable with the name of the text variable it came from.
        for (var i = 0; i < new_r_question.variables.length; i++)
        {
            new_r_question.variables[i].label = variable_names[i];
            new_r_question.variables[i].variableType = "Categorical";
        }

        if (new_r_question.variables.length > 1)
            new_r_question.questionType = "Pick One - Multi";
        else
            new_r_question.questionType = "Pick One";
    }
    else if (output_classes.indexOf("AutomaticCategorization") > -1)
    {
        var text_label = selected_item.data.get("text.label");

        // Same validity guard as above, for the single input text variable;
        // all apostrophes are stripped to match the gsub() on the R side.
        expression = "categorization = " + stringToRName(selected_item.referenceName) +
                     "\nif (gsub('\\'', '', categorization$text.label, fixed = TRUE) != '" + text_label.toString().replace(/'/g, "") + "')\n" +
                     "    stop('The input text variable used for the Text Analysis has changed " +
                     "and as a result this variable is no longer valid. Please delete this variable and rerun the " +
                     script_name + " script on the Text Analysis output.')\n";

        var existing_cat = selected_item.getInput("formExistingCat");
        if (existing_cat !== null) {
            var predicted_type = selected_item.data.getAttribute("predicted", "class");
            if (predicted_type == "data.frame")
            {
                // One column per category: take the column with the largest
                // value in each row, or NA when the row is all missing/zero.
                expression += "predicted <- categorization$predicted\n" +
                              "category.levels <- names(predicted)\n" +
                              "factor(apply(predicted, 1, function(x) {\n"  +
                              "\tif(all(is.na(x)) || sum(x, na.rm = TRUE) == 0) NA else category.levels[which.max(x)]\n" +
                              "}))";
            }
            else
            {
                expression += "categorization$predicted";
            }
        } else {
            expression += "categorization$categorization";
        }

        new_q_name = preventDuplicateQuestionName(data_file, analysis_name + " from " + text_label);
        new_r_question = createQuestion(expression, new_q_name, analysis_name);
        if (new_r_question === null)
            return false;

        new_r_question.variables[0].variableType = "Categorical";
        new_r_question.questionType = "Pick One";
    }
    else if (output_classes.indexOf("EntityExtraction") > -1)
    {
        var entity_variables = selected_item.data.get("entity.variables");

        // A single boolean here means there is nothing to save; report the
        // reason recorded in the output.
        if (entity_variables.length === 1 && typeof entity_variables[0] === "boolean") {
            var empty_reason = selected_item.data.get("entity.variables.empty.reason");
            var reason;
            if (empty_reason == "output") {
                reason = "there are no extracted entities in the output.";
            } else if (empty_reason == "min") {
                var entity_counts = selected_item.data.get("entity.counts");
                var entity_names = selected_item.data.getAttribute("entity.counts", "names");
                var min_cases_to_save = selected_item.data.get("min.cases.to.save");
                var largest_count = entity_counts.reduce(function(x, y) {
                    return (x > y) ? x : y;
                });
                var largest_name = entity_names[entity_counts.indexOf(largest_count)];
                reason = "the entity type with the most number of extractions, '" + largest_name + "', contains " + largest_count + " named entities. However," +
                    " the minimum number of entities required to save is set at " + min_cases_to_save + ". Set 'Minimum number of cases to save' to " +
                    largest_count + " or lower to save the extracted named entities as variables.";
            } else if (empty_reason == "remove") {
                reason = "the only entities in the output have been removed with the user specified remove entities from extraction settings.";
            } else {
                log("Error: unknown reason for no variables to save for this Entity Extraction output");
                return false;
            }
            log("No entity variables have been saved since " + reason);
            return false;
        }

        var entity_type_names = selected_item.data.getAttribute("entity.variables", "names");
        var entity_R_type = selected_item.data.getAttribute("entity.variables", "data.type");

        // Iterate backwards: every question is inserted directly after
        // last_variable, so the last one created ends up first.
        for (let j = entity_variables.length - 1; j >= 0; j--) {
            // Use the same R name for the assignment and for the final line
            // that returns the value, so they cannot refer to different symbols.
            var r_entity_name = stringToRName(entity_type_names[j].replace(/ /g, ".").toLowerCase() + ".variables");
            // Escape the entity type name for use inside a double-quoted R string.
            var r_entity_key = entity_type_names[j].replace(/\\/g, "\\\\").replace(/"/g, '\\"');

            expression = r_entity_name + " <- " + stringToRName(selected_item.referenceName) +
                         '$entity.variables[["' + r_entity_key + '"]]';
            if (entity_R_type[j] === "list") {
                // Only the first column is kept for "list" entity types.
                expression += '[, 1]\n';
            } else {
                expression += '\n';
            }
            expression += r_entity_name;

            new_q_name = preventDuplicateQuestionName(data_file, entity_type_names[j] + " Entities from " +
                                                      selected_item.data.get("text.label"));
            new_r_question = createQuestion(expression, new_q_name, "Variables");
            if (new_r_question === null)
                return false;

            if (!web_mode)
                showInTable(new_r_question);
        }
    }
    else if (output_classes.indexOf("wordBag") > -1)
    {
        var tokenized_text = selected_item.data.get("transformed.tokenized");
        var flattened_tokenized = tokenized_text.flat();
        if (flattened_tokenized.length === 0)
        {
            var base_message = "Variables could not be created from this item, there are no terms in the text after transformation.";
            var min_frequency = selected_item.getInput("formminfreq");
            var extra_message = "";
            if (min_frequency > 1)
            {
                extra_message = " Terms might be found if the minimum frequency control is reduced from " + min_frequency + " to a smaller value.";
            }
            log(base_message + extra_message);
            return false;
        }

        // The R code drops tokens that occur in fewer cases than the minimum
        // frequency, then keeps the first remaining token of each case.
        expression = "wordBag <- " + stringToRName(selected_item.referenceName) + "\n" +
                     "tokenized <- wordBag$transformed.tokenized\n" +
                     "below.min.frequency <- which(table(unlist(lapply(tokenized, unique))) < wordBag$min.frequency)\n" +
                     "if (length(below.min.frequency) != 0)\n" +
                     "{\n" +
                     "\tbelow.min.frequency <- names(below.min.frequency)\n" +
                     "\ttokenized <- lapply(tokenized, function(x) {\n" +
                     "\t\tremove.token <- x %in% below.min.frequency\n" +
                     "\t\tif(any(remove.token)) x <- x[!remove.token]\n" +
                     "\t\t\treturn(x)\n" +
                     "\t\t})\n" +
                     "}\n" +
                     "data.frame(sapply(tokenized, function(x) x[1]), stringsAsFactors = TRUE)";

        new_q_name = preventDuplicateQuestionName(data_file, analysis_name + " from " +
                                                  selected_item.data.getAttribute("original.text", "label"));
        new_r_question = createQuestion(expression, new_q_name, analysis_name);
        if (new_r_question === null)
            return false;
    }
    else
    {
        // No supported output class matched, so there is nothing to save.
        log(bad_selection_message);
        return false;
    }

    // In Q, select the table showing the new question (the Entity Extraction
    // branch has already added one table per entity type).
    if (!web_mode && output_classes.indexOf("EntityExtraction") < 0)
        showInTable(new_r_question);

    return true;
}